Combining Kevin’s Readme with plotly interactive graphics

Load libraries

library(data.table)
library(here)
library(tidyverse)
library(plotly)
library(janitor)
library(lubridate)
library(zoo)

Use UNIX commands to clean file

Replace <<sep>> with a special character.

# system("sed 's/<<sep>>/^/g' all_revisions_1000_articles.txt > all_revisions_1000_articles_caratseparated.txt")
# 
# cat(readLines("all_revisions_1000_articles_caratseparated.txt"), sep = "^")

Read in data

# Read the caret-separated revision dump into a data frame, one row per line
# of the file.
#
# Both quote and comment processing are switched off. read.table() defaults to
# comment.char = "#", which discards everything after a "#" on a line; the
# text columns (e.g. article_title, COMMENT) may contain "#", and with
# fill = TRUE the truncated lines would be padded silently instead of raising
# an error.
all.revisions.1000.path <- here::here("all_revisions_1000_articles_caratseparated.txt")

revisions <- read.table(
  all.revisions.1000.path,
  sep = "^",
  quote = "",
  comment.char = "",
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c("article_id", "rev_id", "article_title", "timestamp",
                "[ip:]username", "user_id", "CATEGORY", "IMAGE", "MAIN",
                "TALK", "USER", "USER_TALK", "OTHER", "EXTERNAL",
                "TEMPLATE", "COMMENT", "MINOR", "TEXTDATA"),
  # Lines with fewer fields than col.names are padded rather than rejected.
  fill = TRUE
)



# colnames(revisions) <- c("article_id", "rev_id", "article_title", "timestamp", "[ip:]username", "user_id", "CATEGORY", "IMAGE", "MAIN", "TALK", "USER", "USER_TALK", "OTHER", "EXTERNAL",
#       "TEMPLATE", "COMMENT", "MINOR", "TEXTDATA")

Clean data that we just read

# Normalise the column names, parse the timestamp strings into date-times and
# derive the calendar fields used for the monthly aggregations.
revisions.processed <- revisions %>%
  clean_names() %>%
  mutate(
    # Turn the "T" separator into a space and strip "Z" before parsing.
    timestamp = ymd_hms(
      gsub("Z", "", gsub("T", " ", timestamp, fixed = TRUE), fixed = TRUE)
    ),
    year = year(timestamp),
    month = month(timestamp),
    # zoo year-month value; used as the grouping key and plot axis later on.
    year_month = as.yearmon(timestamp)
  )
## Warning: package 'bindrcpp' was built under R version 3.4.4

Grab all categories for each article_id

# Collect every category string recorded across an article's revisions into a
# single space-separated string per article_id.
categories <- revisions.processed %>%
  group_by(article_id) %>%
  summarise(categories = paste(category, collapse = " "))

# De-duplicate the categories of each article and re-join them with ", ".
# vapply() guarantees a character vector (sapply() would return a list for
# zero-length input). Empty pieces are deliberately kept: revisions with an
# empty category field can leave a leading "" (printed as ", Some_category"),
# and the later steps of this script expect that layout.
categories$categories <- vapply(
  strsplit(categories$categories, split = " "),
  function(x) {
    paste0(unique(trimws(x)), collapse = ", ")
  },
  character(1)
)

tail(categories, 10)
## # A tibble: 10 x 2
##    article_id categories                                                  
##         <int> <chr>                                                       
##  1   14837345 ""                                                          
##  2   14848541 Polyhedral_compounds                                        
##  3   14849832 Communes_of_Isère                                           
##  4   14854290 , Stock_market_indices                                      
##  5   14909014 Denizli, National_parks_of_Turkey, Mountains_of_Turkey      
##  6   14924457 Defunct_German_football_clubs, German_football_clubs, Germa…
##  7   14929788 Unincorporated_communities_in_Indiana, Brown_County,_Indiana
##  8   14946971 ""                                                          
##  9   14949948 Villages_in_Silesian_Voivodeship                            
## 10   15057718 , Internet_properties_established_in_1997, Companies_establ…

Split up the categories listed within each row entry

# Spread each article's ", "-separated category string over as many columns as
# the longest list needs (at least one), padding shorter lists with "".
n.category.cols <- max(1L, lengths(strsplit(categories$categories, ", ")))

categories.df <- as.data.frame(
  str_split_fixed(categories$categories, ", ", n.category.cols),
  stringsAsFactors = FALSE
)

# Keep every column. The first column is only empty for articles whose
# category string starts with ", "; for the others it holds a real category,
# so dropping it unconditionally would discard data. Empty cells are removed
# later when the table is reshaped to long format.
names(categories.df) <- paste0("category_", seq_len(ncol(categories.df)))
categories.df <- cbind(article_id = categories$article_id, categories.df)

Tidy up categories

Convert from wide to long format, replace empty strings with NA, and filter them out.

# Reshape to one row per (article, category slot), discard the empty slots and
# tally how many times each category occurs, most frequent first.
categories.counts <- categories.df %>%
  gather(key = category, value = categories, -article_id) %>%
  # Padding cells are "" (or NA); neither is a real category.
  filter(!is.na(categories), categories != "") %>%
  group_by(categories) %>%
  summarise(number = n()) %>%
  arrange(desc(number))
## Warning: attributes are not identical across measure variables;
## they will be dropped

Back to Working with Dates

create monthly counts

# Number of revisions per article and calendar month. The result stays grouped
# by article_id after summarise() peels off the year_month grouping level.
monthly.counts <- revisions.processed %>%
  group_by(article_id, year_month) %>%
  summarise(count = n())

# Show the counts ordered by article and then chronologically.
monthly.counts %>%
  arrange(article_id, year_month)
## # A tibble: 8,862 x 3
## # Groups:   article_id [1,021]
##    article_id year_month    count
##         <int> <S3: yearmon> <int>
##  1          1 <NA>              1
##  2         47 Jan 2001          1
##  3        338 <NA>              1
##  4        559 <NA>              1
##  5        913 <NA>              1
##  6       1438 <NA>              1
##  7       1763 <NA>              1
##  8       3527 Feb 2002          1
##  9       6330 Sep 2001          1
## 10       6330 Feb 2002          1
## # ... with 8,852 more rows
# Line-and-point chart of monthly revision counts, one line per article.
# NOTE(review): year_month is a zoo yearmon, so ggplot2 falls back to a
# continuous x scale; zoo::scale_x_yearmon() would give month-year tick
# labels -- confirm it renders correctly through ggplotly() before adding it.
p <- ggplot(data = monthly.counts, aes(x = year_month, y = count, colour = article_id)) +
  # Group explicitly so each article gets its own line.
  geom_line(aes(group = article_id)) +
  geom_point() +
  ggtitle("Number of Article Revisions by Year & Month") +
  xlab("Year-Month")

p
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 6 rows containing missing values (geom_point).

# Write the plot to disk as a PNG; the path is relative to the working
# directory. NOTE(review): presumably the img/ directory must already exist --
# confirm.
ggsave("img/article-revisions-by-year-month.png", p)
## Saving 7 x 5 in image
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 6 rows containing missing values (geom_point).
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

It could be interesting to look at which articles have such a high number of revisions.

For now, let’s normalize by total number of revisions to get percentages.

# Express each month's revision count as a share of that article's total
# revisions, so articles with very different activity levels are comparable.
percent.revisions <- monthly.counts %>%
  group_by(article_id) %>%
  mutate(percent = prop.table(count))

# Same line-and-point chart as for the raw counts, but on the normalised share.
p <- ggplot(
  data = percent.revisions,
  mapping = aes(x = year_month, y = percent, colour = article_id)
) +
  geom_line(aes(group = article_id)) +
  geom_point() +
  labs(
    x = "Year-Month",
    title = "Revisions Per Year-Month Normalized by Total Revisions"
  )

p
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.
## Warning: Removed 6 rows containing missing values (geom_point).

# Interactive plotly version of the normalised-revisions plot.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

Plot this together with each article’s first revision date to support our point about the relationship between article IDs and article age.

It looks like articles that were created after 2006 might not have had enough time to show the true trends of an article’s lifecycle. Let’s only look at articles that were first edited before 2006.

First Revision Date for Articles First Edited Before 2006

When was the Wiki page first revised? That information will help inform us about the behavior of a page relative to other pages created at different times.

Plot this together with each article’s first revision date to support our point about the relationship between article IDs and article age.

It looks like articles that were created after 2006 might not have had enough time to show the true trends of an article’s lifecycle. Let’s only look at articles that were first edited before 2006.

Note: this won’t work if the data isn’t sorted by date; we need the date of the first revision for each article ID.

For each id, obtain the first time it was edited.

# For each article, find the month of its earliest revision, expressed as a
# Date (the first day of that month).
#
# as.Date() on a zoo yearmon does not use a `format` argument (the previous
# format = '%b %y' was silently ignored), so it is omitted. Months that failed
# to parse are NA and are skipped when taking the minimum; an article with no
# valid month at all keeps an NA date instead of triggering a warning.
# NOTE(review): no "before 2006" filter is applied in this statement --
# presumably it happens in a later step; confirm.
articles.before.2006 <- percent.revisions %>%
  mutate(year_month = as.Date(year_month)) %>%
  group_by(article_id) %>%
  summarise(
    first_revis_date = if (all(is.na(year_month))) {
      as.Date(NA)
    } else {
      min(year_month, na.rm = TRUE)
    }
  )